We are using the “Now That’s What I Call Music(U.S. releases)” data set from Kaggle to predict the danceability of a song. Danceability is described with how much a song will get people todance from the tempo, rhythm stability, beat, and regulatiry. The scale of danceability ranges from 0 to 1, 0 being the least danceable while 1 is the most danceable song.
Looking at the data there does not seem to be any null data within the database. From looking at the database it deo
library(rpart)
library(rpart.plot)
library(maptree)
## Loading required package: cluster
library(e1071)
library(cluster)
# the following utility files can be found attached to the assignment
source("https://raw.githubusercontent.com/grbruns/cst383/master/lin-regr-util.R")
source("https://raw.githubusercontent.com/grbruns/cst383/master/class-util.R")
dat=read.csv("data.csv")
sum(is.null(dat))
## [1] 0
dat$X = NULL
dat$loudness=abs(dat$loudness)
dat$song_title = NULL
dat$instrumentalness = NULL
dat$danceability=dat$danceability*100
dat$energy=dat$energy*100
dat$acousticness=dat$acousticness*100
dat$liveness=dat$liveness*100
dat$valence=dat$valence*100
dat$speechiness=dat$speechiness*100
dat$song_title=NULL
summary(dat)
## acousticness danceability duration_ms energy
## Min. : 0.00028 Min. :12.20 Min. : 16042 Min. : 1.48
## 1st Qu.: 0.96300 1st Qu.:51.40 1st Qu.: 200015 1st Qu.:56.30
## Median : 6.33000 Median :63.10 Median : 229261 Median :71.50
## Mean :18.75900 Mean :61.84 Mean : 246306 Mean :68.16
## 3rd Qu.:26.50000 3rd Qu.:73.80 3rd Qu.: 270333 3rd Qu.:84.60
## Max. :99.50000 Max. :98.40 Max. :1004627 Max. :99.80
##
## key liveness loudness mode
## Min. : 0.000 Min. : 1.88 Min. : 0.307 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.: 9.23 1st Qu.: 4.746 1st Qu.:0.0000
## Median : 6.000 Median :12.70 Median : 6.248 Median :1.0000
## Mean : 5.343 Mean :19.08 Mean : 7.086 Mean :0.6123
## 3rd Qu.: 9.000 3rd Qu.:24.70 3rd Qu.: 8.394 3rd Qu.:1.0000
## Max. :11.000 Max. :96.90 Max. :33.097 Max. :1.0000
##
## speechiness tempo time_signature valence
## Min. : 2.310 Min. : 47.86 Min. :1.000 Min. : 3.48
## 1st Qu.: 3.750 1st Qu.:100.19 1st Qu.:4.000 1st Qu.:29.50
## Median : 5.490 Median :121.43 Median :4.000 Median :49.20
## Mean : 9.266 Mean :121.60 Mean :3.968 Mean :49.68
## 3rd Qu.:10.800 3rd Qu.:137.85 3rd Qu.:4.000 3rd Qu.:69.10
## Max. :81.600 Max. :219.33 Max. :5.000 Max. :99.20
##
## target artist
## Min. :0.0000 Drake : 16
## 1st Qu.:0.0000 Rick Ross : 13
## Median :1.0000 Disclosure : 12
## Mean :0.5057 Backstreet Boys: 10
## 3rd Qu.:1.0000 WALK THE MOON : 10
## Max. :1.0000 Crystal Castles: 9
## (Other) :1947
fit=lm(danceability ~ acousticness+energy+liveness+loudness+speechiness+valence, data=dat)
summary(fit)
##
## Call:
## lm(formula = danceability ~ acousticness + energy + liveness +
## loudness + speechiness + valence, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.345 -8.519 0.648 9.920 42.729
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.33759 2.48554 29.103 < 2e-16 ***
## acousticness -0.16090 0.01555 -10.349 < 2e-16 ***
## energy -0.27008 0.02534 -10.659 < 2e-16 ***
## liveness -0.09729 0.02030 -4.792 1.77e-06 ***
## loudness -0.58480 0.12637 -4.628 3.93e-06 ***
## speechiness 0.21665 0.03420 6.335 2.92e-10 ***
## valence 0.30039 0.01295 23.202 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.68 on 2010 degrees of freedom
## Multiple R-squared: 0.2809, Adjusted R-squared: 0.2788
## F-statistic: 130.9 on 6 and 2010 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(density(dat$danceability),col="red",main="Density of Features", ylim = c(0,.07))
lines(density(dat$energy),col="green")
lines(density(dat$liveness),col="blue")
legend("topright",legend=c("Dance","Energy","Liveness"),col=c("Red","Green","Blue"),lty=1,cex=.7)
plot(ecdf(dat$danceability),col="red",main="ECDF of Features")
lines(ecdf(dat$energy),col="green")
lines(ecdf(dat$liveness),col="blue")
legend("bottomright",legend=c("Dance","Energy","Liveness"),col=c("Red","Green","Blue"),lty=1,cex=.7)
plot(danceability ~ energy, data=dat, col="red",main="Danceability by Energy")
plot(danceability ~ liveness, data=dat, col="red",main="Danceability by Liveness")
set.seed(123)
split = split_data(dat)
tr_dat = split[[1]]
te_dat = split[[2]]
fit1=lm(danceability ~ energy+liveness, data=tr_dat)
predicts=predict(fit1, te_dat)
actual=te_dat$danceability
rng=range(c(predicts,actual))
plot(actual ~ predicts, pch=20, main="Predicted Danceablity Vs. Actual",col="red")
lines(c(rng[1],rng[2]),c(rng[1],rng[2]),lty=2,col="blue",lwd=1.5)
par(mfrow=c(2,2))
plot(fit1)
summary(fit1)
##
## Call:
## lm(formula = danceability ~ energy + liveness, data = tr_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.525 -10.447 0.845 12.048 36.668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.40806 1.40251 43.784 < 2e-16 ***
## energy 0.04211 0.01987 2.120 0.0342 *
## liveness -0.13114 0.02637 -4.974 7.33e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.81 on 1509 degrees of freedom
## Multiple R-squared: 0.01689, Adjusted R-squared: 0.01559
## F-statistic: 12.96 on 2 and 1509 DF, p-value: 2.614e-06
mse=mean((actual-predicts)^2)
rmse=sqrt(mse)
cat("\nRoot Mean Square: ",rmse)
##
## Root Mean Square: 16.30534
te_dat$artist=NULL
te_dat$key=NULL
te_dat$mode=NULL
te_dat$target=NULL
te_dat$time_signature=NULL
te_dat$duration_ms=NULL
te_dat$tempo=NULL
par(mfrow=c(1,2))
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,10)
cols=rainbow(10)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=10")
plot(danceability ~ liveness,data=te_dat,col=cols,pch=16,main="Danceability by Liveness, n=10")
fit2=kmeans(te_dat,10)
cat("Betweenss of fit: ",fit2$betweenss)
## Betweenss of fit: 875822.6
cat("Totss of fit: ",fit2$totss)
## Totss of fit: 1193276
cat("Betweenss/Totss", (fit2$betweenss/fit2$totss))
## Betweenss/Totss 0.7339648
sil=silhouette(fit2$cluster,dist(te_dat))
cat("Silhouette Width", mean(sil[,3]))
## Silhouette Width 0.2102025
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,100)
cols=rainbow(100)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=100")
plot(danceability ~ liveness,data=te_dat,col=cols,pch=16,main="Danceability by Liveness, n=100")
fit3=kmeans(te_dat,100)
cat("Betweenss of fit: ",fit3$betweenss)
## Betweenss of fit: 1113266
cat("Totss of fit: ",fit3$totss)
## Totss of fit: 1193276
cat("Betweenss/Totss", (fit3$betweenss/fit3$totss))
## Betweenss/Totss 0.9329497
sil2=silhouette(fit3$cluster,dist(te_dat))
cat("Silhouette Width",mean(sil2[,3]))
## Silhouette Width 0.209537
fit = rpart(danceability ~ energy +liveness, data = tr_dat)
prp(fit, extra=1, varlen=-10, main="Regression Tree for Danceability (tr_dat)",box.col= "green")
predicted = predict(fit, te_dat)
errors = te_dat$danceability - predicted
rmse = sqrt(mean(errors^2))
plot_predict_actual(predicted, actual, 2, "Regression Tree Danceablility Prediction")
hist(errors, col = "red4", main = "Histogram of Errors, Regression Tree")
par(mfrow=c(2,2))
plot(density(dat$danceability),col="red",main="Density of Features")
lines(density(dat$energy),col="green")
lines(density(dat$valence),col="blue")
legend("topright",legend=c("Dance","Energy","Valence"),col=c("Red","Green","Blue"),lty=1,cex=.7)
plot(ecdf(dat$danceability),col="red",main="ECDF of Features")
lines(ecdf(dat$energy),col="green")
lines(ecdf(dat$valence),col="blue")
legend("bottomright",legend=c("Dance","Energy","Valence"),col=c("Red","Green","Blue"),lty=1,cex=.7)
plot(danceability ~ energy, data=dat, col="red",main="Danceability by Energy")
plot(danceability ~ valence, data=dat, col="red",main="Danceability by Valence")
fit4 = lm(danceability ~ energy+valence, data=tr_dat)
predicts1 = predict(fit4, newdata=te_dat, type="response")
actual1=te_dat$danceability
rang = range(c(predicts1, actual1))
par(mfrow=c(1,1))
plot(predicts1 ~ actual1, main="Predicting Danceability using Energy and Valence", ylab = "Prediction", xlab="Actuals", col="red", pch=20)
lines(c(rang[1], rang[2]),
c(rang[1], rang[2]), col="blue",lty=2)
par(mfrow=c(2,2))
plot(fit4)
mse1=mean((actual1-predicts1)^2)
rmse1=sqrt(mse1)
cat("Root Mean Square: ",rmse1)
## Root Mean Square: 14.6752
From the graphs exploring the tree of the data of Valence and Energy to danceability using testing data and training and seeing the comparison of them.
par(mfrow=c(1,2))
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,10)
cols=rainbow(10)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=10")
plot(danceability ~ valence,data=te_dat,col=cols,pch=16,main="Danceability by Valence, n=10")
fit2=kmeans(te_dat,10)
cat("Betweenss of fit: ",fit2$betweenss)
## Betweenss of fit: 876718.2
cat("Totss of fit: ",fit2$totss)
## Totss of fit: 1193276
cat("Betweenss/Totss", (fit2$betweenss/fit2$totss))
## Betweenss/Totss 0.7347154
sil=silhouette(fit2$cluster,dist(te_dat))
cat("Silhouette Width", mean(sil[,3]))
## Silhouette Width 0.2158532
hc=hclust(dist(te_dat),method="complete")
clusters=cutree(hc,100)
cols=rainbow(100)[clusters]
plot(danceability ~ energy,data=te_dat,col=cols,pch=16,main="Danceability by Energy, n=100")
plot(danceability ~ valence,data=te_dat,col=cols,pch=16,main="Danceability by Valence, n=100")
fit3=kmeans(te_dat,100)
cat("Betweenss of fit: ",fit3$betweenss)
## Betweenss of fit: 1112912
cat("Totss of fit: ",fit3$totss)
## Totss of fit: 1193276
cat("Betweenss/Totss", (fit3$betweenss/fit3$totss))
## Betweenss/Totss 0.9326528
sil2=silhouette(fit3$cluster,dist(te_dat))
cat("Silhouette Width",mean(sil2[,3]))
## Silhouette Width 0.2038265
fit5 = rpart(danceability ~ energy +valence, data = tr_dat)
prp(fit5, extra=1, varlen=-10, main="Regression Tree for Danceability (tr_dat)",box.col= "green")
fit6 = rpart(danceability ~ energy +valence, data = te_dat)
prp(fit6, extra=1, varlen=-10, main="Regression Tree for Danceability (te_dat)",box.col= "green")
predicted = predict(fit5, te_dat)
errors = te_dat$danceability - predicted
rmse = sqrt(mean(errors^2))
plot_predict_actual(predicted, actual, 2, "Regression Tree Danceablility Prediction")
hist(errors, col = "red4", main = "Histogram of Errors, Regression Tree")
dat$dance=ifelse(dat$danceability>70,1,0)
fit=glm(dance ~ energy + valence, data=dat,family=binomial)
summary(fit)
##
## Call:
## glm(formula = dance ~ energy + valence, family = binomial, data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6541 -0.8792 -0.6233 1.0968 2.3357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.393615 0.181184 -7.692 1.45e-14 ***
## energy -0.015742 0.002607 -6.039 1.55e-09 ***
## valence 0.034094 0.002316 14.724 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2579.5 on 2016 degrees of freedom
## Residual deviance: 2324.9 on 2014 degrees of freedom
## AIC: 2330.9
##
## Number of Fisher Scoring iterations: 4
#dat$dance=as.factor(dat$dance)
split = split_data(dat)
tr_dat = split[[1]]
te_dat = split[[2]]
fit=glm(dance ~ energy + valence,data=tr_dat,family=binomial)
y=predict(fit,newdata=te_dat,type="response")
predicts=as.numeric(y>.7)
actuals=te_dat$dance
table(predicts,actuals)
## actuals
## predicts 0 1
## 0 314 181
## 1 0 10
cat("\nAccuracy of predictions: ",mean(predicts==actuals))
##
## Accuracy of predictions: 0.6415842
fit1=glm(dance ~ energy, data=dat, family=binomial)
fit2=glm(dance ~ valence, data=dat, family=binomial)
temp1=data.frame(energy=seq(min(dat$energy),max(dat$energy),10))
temp2=data.frame(valence=seq(min(dat$valence),max(dat$valence),10))
probs1=predict(fit1,newdata=temp1,type="response")
probs2=predict(fit2,newdata=temp2,type="response")
plot(dance ~ energy,data=tr_dat,col="red",pch=20,ylab="P(danceability)")
lines(temp1$energy,probs1,col="blue",lwd=2)
plot(dance ~ valence,data=tr_dat,col="red",pch=20,ylab="P(danceability)")
lines(temp2$valence,probs2,col="blue",lwd=2)
fit.1=glm(dance ~ energy + valence, data=tr_dat,family=binomial)
predicts=predict(fit.1,te_dat,type="response")
actuals=te_dat$dance
plot(density(predicts[actuals==1]),col="red",xlab="Logistic Regression output",main="Double Density of Energy and Valence",ylim=c(0,3.0))
legend("topright",col=c("red","blue"),legend=c("Danceable","Not Danceable"),lty=1)
lines(density(predicts[actuals==0]),col="blue",lwd=2)